library(tidyverse)
library(rvest)
library(knitr)
library(leaflet)
#install.packages("rgdal",repos = "http://cran.us.r-project.org")
library(rgdal)
library(lubridate)
#library(dplyr)
#library(plyr)
library(plotly)
col1 = "#d8e1cf"
col2 = "#438484"
theme_set(theme_minimal() + theme(legend.position = "bottom"))
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
Load the historic and year-to-datedatasets of NYPD shooting incident
shooting_initial =
read_csv("./data/NYPD_Shooting.csv") %>% janitor::clean_names()
## Rows: 23585 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): OCCUR_DATE, BORO, LOCATION_DESC, PERP_AGE_GROUP, PERP_SEX, PERP_R...
## dbl (7): INCIDENT_KEY, PRECINCT, JURISDICTION_CODE, X_COORD_CD, Y_COORD_CD...
## lgl (1): STATISTICAL_MURDER_FLAG
## time (1): OCCUR_TIME
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
shooting_2021 = read_csv("./data/NYPD_shooting_New.csv") %>% janitor::clean_names()
## Rows: 1531 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): OCCUR_DATE, BORO, LOCATION_DESC, PERP_AGE_GROUP, PERP_SEX, PERP_R...
## dbl (5): INCIDENT_KEY, PRECINCT, JURISDICTION_CODE, Latitude, Longitude
## lgl (1): STATISTICAL_MURDER_FLAG
## time (1): OCCUR_TIME
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#A variable name in shooting_new is different from the initial data, change column name in order to merge the data frames
shooting_2021 = shooting_2021 %>%
rename(lon_lat = new_georeferenced_column)
shooting = rbind(shooting_initial, shooting_2021)
check null
shooting %>%
summarise_all(~ sum(is.na(.)))
## # A tibble: 1 × 19
## incident_key occur_date occur_time boro precinct jurisdiction_code
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 3
## # … with 13 more variables: location_desc <int>, statistical_murder_flag <int>,
## # perp_age_group <int>, perp_sex <int>, perp_race <int>, vic_age_group <int>,
## # vic_sex <int>, vic_race <int>, x_coord_cd <int>, y_coord_cd <int>,
## # latitude <int>, longitude <int>, lon_lat <int>
For col boro
shooting = shooting %>%
mutate(boro = as.factor(boro)) %>%
mutate(location_desc = replace_na(location_desc, "NONE")) %>%
mutate(location_desc = as.factor(location_desc)) %>%
separate(occur_date, into = c("month", "day", "year")) %>%
mutate(month = as.numeric(month)) %>%
arrange(year, month) %>%
# mutate(month = month.name[month]) %>%
mutate(year = as.character(year)) %>%
mutate(boro = tolower(boro)) %>%
mutate(boro = if_else(boro == "staten island", "staten_island", boro)) %>%
rename(borough = boro) %>%
mutate(date = str_c(month, day, year, sep = "/")) %>%
select(incident_key, date, everything())
Next, clean the COVID-19 case count data
Importing COVID-19 case count data
covid_counts = read.csv("./data/COVID19_data.csv", sep = ";") %>% as_tibble()
The clean dataset contains only day-by-day COVID-19 case count for each borough and the total case count in NYC of a particular day.
clean_covid = covid_counts %>%
janitor::clean_names() %>%
rename(date = date_of_interest) %>%
select(date, contains("case_count")) %>%
select(-contains(c("probable_case_count", "case_count_7day_avg", "all_case_count_7day_avg"))) %>%
separate(date, into = c("month", "day", "year")) %>%
mutate_all(as.character) %>%
mutate_if(is.character, gsub, pattern = ",", replacement = "") %>%
mutate_if(is.character, as.numeric) %>%
pivot_longer(
cols = bx_case_count:si_case_count,
names_to = "borough",
values_to = "borough_case_count"
) %>%
mutate(borough = gsub("_case_count", "", borough)) %>%
mutate(borough = recode(borough, "bx" = "bronx","bk" = "brooklyn","mn" = "manhattan","si" = "staten_island","qn" = "queens")) %>%
relocate(case_count, .after = borough_case_count) %>%
rename(total_case_count = case_count) %>%
mutate(date = str_c(month, day, year, sep = "/")) %>%
select(date, everything())
head(clean_covid)
## # A tibble: 6 × 7
## date month day year borough borough_case_count total_case_count
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 2/29/2020 2 29 2020 bronx 0 1
## 2 2/29/2020 2 29 2020 brooklyn 0 1
## 3 2/29/2020 2 29 2020 manhattan 1 1
## 4 2/29/2020 2 29 2020 queens 0 1
## 5 2/29/2020 2 29 2020 staten_island 0 1
## 6 3/1/2020 3 1 2020 bronx 0 0
shooting_heatmap = shooting_initial %>%
mutate(occur_date = as.Date(occur_date,'%m/%d/%Y')) %>%
mutate(occur_date = weekdays(occur_date)) %>%
separate(occur_time, into = c("hour", "minute", "second")) %>%
mutate(hour = as.factor(hour)) %>%
select(incident_key, occur_date, hour) %>%
mutate(occur_date = as.factor(occur_date),
occur_date = fct_relevel(occur_date, "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday" , "Saturday"))
dayHour = plyr::ddply(shooting_heatmap, c( "hour", "occur_date"), summarise, N = length(incident_key))
attach(dayHour)
heatmap = ggplot(dayHour, aes(hour, occur_date)) +
geom_tile(aes(fill = N),colour = "white", na.rm = TRUE) +
scale_fill_gradient(low = col1, high = col2) +
guides(fill = guide_legend(title = "Total Shooting Cases")) +
theme_bw() +
theme_minimal() +
labs(title = "Time Based Heatmap",
x = "Shooting Cases Per Hour", y = "Day of Week") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
heatmap

According to the result of this heatmap, the midnight of weekends(Sunday and Saturday) have the highest risk of shooting cases. Additionally, daytime between 7 in the morning and 19 in the evening seems to have lower shooting cases than the other time of the day.
shooting_year = shooting %>%
group_by(year) %>%
summarise(n_obs = n())
#visualization shooting incidence trend
shooting_year %>%
plot_ly( x = ~year, y = ~n_obs, type = "scatter", mode = "lines+markers") %>%
layout(title = "Shooting Incidence Trend from 2006 to 2020",
xaxis = list(title = "Year"),
yaxis = list(title = "Frequency"))
shooting_month = shooting %>%
mutate(month = as.factor(month)) %>%
group_by(month) %>%
summarise(n_obs = n())
shooting_month %>%
plot_ly(x = ~month, y = ~n_obs, color = ~month, type = "bar") %>%
layout( title = "The Distribution of Shooting Incidence by Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Frequency"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
shooting_time = shooting %>%
##format the occur_time variable to only hours.
mutate(occur_time_hour = format(as.POSIXct(occur_time), format = "%H")) %>%
mutate(occur_time_hour = as.numeric(occur_time_hour)) %>%
group_by(occur_time_hour) %>%
summarise(case_numb = n())
#divide day time to 4 groups: 0-6;6-12;12-18;18-24
shooting_time = shooting_time %>%
mutate(occur_time_range = case_when(
occur_time_hour >= 0 & occur_time_hour < 6 ~ "0-6",
occur_time_hour >= 6 & occur_time_hour < 12 ~ "6-12",
occur_time_hour >= 12 & occur_time_hour < 18 ~ "12-18",
occur_time_hour >= 18 & occur_time_hour < 24 ~ "18-24"))
shooting_time = shooting_time %>%
mutate(occur_time_range = factor(occur_time_range, levels = c("0-6","6-12","12-18","18-24")))
ggplot(shooting_time, aes(x = occur_time_range, y = case_numb, fill = occur_time_range)) + geom_col(alpha = 1) + labs(x = "Occur Time Range",
y = "Frequency",
title = "Distribution of Shooting Case by Day")

shooting_2020 = shooting %>%
filter(year == "2020") %>%
mutate(month = as.factor(month)) %>%
group_by(month) %>%
summarise(n_obs = n())
ggplot(shooting_2020, aes(x = month, y = n_obs, fill = month)) + geom_col(alpha = 1) + labs(
x = "Month",
y = "Frequency",
title = "Distribution of Shooting Case by Day in NYC")

This is an interactive map of shooting incidents from 2006 to 2021 in NYC. Incidents’ details will be displayed after clicking the icon. GIS data of Boroughs’ boundaries were obtained from NYC Open Data.
nyc_boro = readOGR("./data/Borough_Boundaries/geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877.shp", layer = "geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded datum WGS84 in Proj4 definition: +proj=longlat +ellps=WGS84
## +no_defs
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/sylvia/Desktop/P8105/gun_violence_nyc.github.io/data/Borough_Boundaries/geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877.shp", layer: "geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877"
## with 5 features
## It has 4 fields
shooting %>%
# filter(year %in% c("2020", "2021")) %>%
mutate_at(c("perp_age_group", "perp_sex", "perp_race"), funs(ifelse(is.na(.), "unknown", .))) %>%
mutate(labels = str_c("<b>Incident Key: </b>", incident_key,
"<br>", "<b>Date: </b>", date,
"<br>", "<b>Borough: </b>", borough,
"<br>", "<b>Murdered: </b>", statistical_murder_flag,
"<br>", "<b>Perpetrator's Race: </b>", perp_race,
"<br>", "<b>Victim's Race: </b>", vic_race,
"<br>", "<b>Perpetrator's Age: </b>", perp_age_group,
"<br>", "<b>Victim's Age: </b>", vic_age_group
)) %>%
leaflet() %>%
addTiles() %>%
addProviderTiles("CartoDB.Positron") %>%
addMarkers(lng = ~longitude, lat = ~latitude, popup = ~labels,
clusterOptions = markerClusterOptions()) %>%
addPolygons(data = nyc_boro,
weight = 0.85,
label = ~nyc_boro@data$boro_name)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.